Classification Metrics Explained | Sensitivity, Precision, AUROC, & More

install package

pip install -U scikit-learn
pip install -U kaggle
pip install -U kagglehub

load package
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
#from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, roc_curve,
    accuracy_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix,
    precision_recall_curve
)
download data from Kaggle
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")
print("Path to dataset files:", path)
Path to dataset files: /Users/jinchaoduan/.cache/kagglehub/datasets/uciml/pima-indians-diabetes-database/versions/1
show data files under the download folder
import os
os.listdir(path)
['diabetes.csv']
read data
df = pd.read_csv(path + '/' + os.listdir(path)[0])
df.head()
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
df.Outcome.value_counts()
Outcome
0 500
1 268
Name: count, dtype: int64
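The classes are imbalanced at roughly 65/35 (500 negatives vs. 268 positives), which is worth keeping in mind throughout: it is the reason this walkthrough looks beyond accuracy to recall, precision, and the precision-recall curve.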
# separate features from response
X = df.drop('Outcome', axis=1)
y = df['Outcome']
# split data into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
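One optional tweak, added here for illustration and not used in the original split: with imbalanced classes, stratify=y preserves the 65/35 class ratio in both the training and test sets. The _s variable names are illustrative.

# stratified variant (illustrative); everything below uses the plain random split above
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)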
# initialize and train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
LogisticRegression(max_iter=1000)
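A side note on max_iter: the default lbfgs solver caps at 100 iterations and often fails to converge on unscaled features like these, so raising the cap to 1000 (or standardizing the features first) avoids the convergence warning.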
# predict on the test set and get the probas
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
# quickly look at the distribution of the probas
percentiles = np.percentile(y_pred_proba, [5, 25, 50, 75, 95])
percentiles
array([0.03455652, 0.11989883, 0.29954411, 0.64776581, 0.87083353])
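The median predicted probability sits near 0.30, well below 0.5, so the default decision threshold matters. As a quick illustration (added here, not part of the original notebook), the hard labels from model.predict are just these probabilities thresholded at 0.5; y_pred_manual is an illustrative name:

# for binary LogisticRegression, predict() labels a sample positive
# exactly when the positive-class probability exceeds 0.5
y_pred_manual = (y_pred_proba > 0.5).astype(int)
assert (y_pred_manual == y_pred).all()  # matches model.predict(X_test)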
confusion matrix
# generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.xticks([0.5, 1.5], ['No Diabetes', 'Diabetes'])
plt.yticks([0.5, 1.5], ['No Diabetes', 'Diabetes'], va='center')
plt.show()
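For reading the heatmap: scikit-learn's confusion_matrix puts true labels on rows and predicted labels on columns, so for this binary problem the array is [[TN, FP], [FN, TP]]. That ordering is what the .ravel() unpacking in the specificity cell below relies on.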
# recall / sensitivity
recall = recall_score(y_test, y_pred)
recall
0.6727272727272727
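Recall (also called sensitivity or the true positive rate) is TP / (TP + FN): of the 55 actual diabetes cases in the test set, the model catches about 67%.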
# precision / positive predictive value
precision = precision_score(y_test, y_pred)
precision
0.6379310344827587
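Precision (positive predictive value) is TP / (TP + FP): when the model predicts diabetes, it is right about 64% of the time.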
# specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
specificity
np.float64(0.7878787878787878)
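Specificity (true negative rate) is TN / (TN + FP): about 79% of the 99 non-diabetic patients are correctly labeled negative. scikit-learn has no specificity_score, hence the manual calculation; equivalently, specificity is recall of the negative class, so recall_score(y_test, y_pred, pos_label=0) gives the same number.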
# accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy
0.7467532467532467
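Accuracy is (TP + TN) / total, the overall fraction correct. On imbalanced data it flatters the model: always predicting 'no diabetes' would already score 99/154 ≈ 0.64 on this test set.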
# f1
f1 = f1_score(y_test, y_pred)
f1
0.6548672566371682
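F1 is the harmonic mean of precision and recall, 2PR / (P + R), so it is pulled toward the smaller of the two; here it lands between the precision of 0.638 and the recall of 0.673.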
# get ROC curve values
fpr, tpr, thresholds_roc = roc_curve(y_test, y_pred_proba)
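The thresholds array can be mined for an operating point. A minimal sketch (an addition, not in the original notebook) that picks the threshold maximizing Youden's J statistic, TPR - FPR:

# Youden's J marks the ROC point farthest above the chance diagonal
j = tpr - fpr
best = np.argmax(j)
print(f"threshold={thresholds_roc[best]:.3f}, TPR={tpr[best]:.3f}, FPR={fpr[best]:.3f}")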
# get PR curve values
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_pred_proba)
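Careful: this line rebinds precision and recall from the scalar metrics computed earlier to full arrays with one value per threshold (plus a final point at precision 1, recall 0). If the scalars are still needed afterwards, give the curve arrays different names.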
# get areas under the curves
auroc = roc_auc_score(y_test, y_pred_proba)
pr_auc = average_precision_score(y_test, y_pred_proba)
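For context (an added note): a no-skill classifier scores an AUROC of 0.5 regardless of class balance, while its expected PR-AUC equals the positive-class prevalence, so the two areas are judged against different baselines:

# no-skill baselines: 0.5 for AUROC, positive prevalence for PR-AUC
baseline_pr = y_test.mean()  # fraction of positives in the test set
print(f"AUROC baseline: 0.5, PR-AUC baseline: {baseline_pr:.3f}")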
# plot both curves
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {auroc:.2f}')
ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('Receiver Operating Characteristic (ROC) Curve')
ax1.legend(loc="lower right")

# Plot Precision-Recall Curve
ax2.plot(recall, precision, color='purple', lw=2, label=f'PR-AUC = {pr_auc:.2f}')
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision-Recall Curve')
ax2.legend(loc="lower left")

plt.show()
y_test.value_counts()
Outcome
0 99
1 55
Name: count, dtype: int64
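With 55 positives out of 154 test samples, the no-skill PR-AUC baseline is about 0.36, which is the right yardstick for the precision-recall curve above.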
References
https://www.youtube.com/watch?v=KdUrfY1yM0w
https://github.com/RichardOnData/YouTube/blob/main/Python%20Notebooks/classification_metrics.ipynb